1 Introduction

Here, we will apply a k-nearest neighbor (KNN) algorithm to classify the scATAC cells to a given cell type category with the help of our training set, the Multiome experiment. Remember, that KNN works on a basic assumption that data points of similar categories are closer to each other.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(Signac)
library(flexclust)
library(tidyverse)
library(plyr)
library(harmony)
library(class)
library(ggplot2)
library(reshape2)

2.2 Parameters

cell_type = "CD4_T"

# Paths
path_to_obj <- str_c(
  here::here("scATAC-seq/results/R_objects/level_4/"),
  cell_type,
  "/",
  cell_type,
  "_integrated_level_4.rds",
  sep = ""
)

path_to_obj_RNA <- str_c(
  here::here("scRNA-seq/3-clustering/4-level_4/"),
  cell_type,
    "/",
  cell_type,
  "_integrated_level_4.rds",
  sep = ""
)

path_to_level_4 <- here::here("scATAC-seq/results/R_objects/level_4/CD4_T/")
path_to_save <- str_c(path_to_level_4, "CD4_T_annotated_level_4.rds")

2.3 Variables

reduction <- "harmony"
dims <- 1:40
color_palette<- c("black", "gray", "red", "yellow", "violet", "green4",
                  "blue", "chocolate1", "coral2", "blueviolet",
                  "brown1", "darkmagenta", "deepskyblue1", "dimgray",
                  "deeppink1", "green", "lightgray", "hotpink1",
                  "indianred4", "khaki", "mediumorchid2")

2.4 Load data

We need to load the scRNAseq annotation from Multiome experiment (cell barcode and cell-type assigned) and the integrated scATAC data. Note that there are 221 cells difference between scATAC and scRNA from multiome.

seurat <- readRDS(path_to_obj_RNA)

tonsil_RNA_annotation <- seurat@meta.data %>%
  rownames_to_column(var = "cell_barcode") %>%
  dplyr::filter(assay == "multiome") %>%
  dplyr::select("cell_barcode", "annotation_level_3")
head(tonsil_RNA_annotation)
##                           cell_barcode         annotation_level_3
## 1 co7dzuup_xuczw9vc_AAACATGCAAGCCAGA-1                 CD200+TOX+
## 2 co7dzuup_xuczw9vc_AAACATGCAAGGTATA-1                      Naive
## 3 co7dzuup_xuczw9vc_AAACCGGCATGCTATG-1 Follicular Th CXCL13+CBLB+
## 4 co7dzuup_xuczw9vc_AAACGCGCATTGTGTG-1                      Naive
## 5 co7dzuup_xuczw9vc_AAAGCGGGTTTGGGCG-1       Follicular Th CXCR5+
## 6 co7dzuup_xuczw9vc_AAATGCCTCACCTGTC-1                 CD200+TOX+
DimPlot(seurat,
  group.by = "annotation_level_3",
  cols = color_palette,
  pt.size = 0.1)

seurat_ATAC <- readRDS(path_to_obj)
seurat_ATAC
## An object of class Seurat 
## 270784 features across 19321 samples within 1 assay 
## Active assay: peaks_macs (270784 features, 198233 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
p1 <- DimPlot(seurat_ATAC,
  pt.size = 0.1)
p1

Annotation level 1 for scATAC will be defined “a priori” as unannotated and the scRNA annotation will be transfered to the scATAC-multiome cells based on the same cell barcode.

tonsil_scATAC_df <- data.frame(cell_barcode = colnames(seurat_ATAC)[seurat_ATAC$assay == "scATAC"])
tonsil_scATAC_df$annotation_level_3 <- "unannotated"

df_all <- rbind(tonsil_RNA_annotation,tonsil_scATAC_df)
rownames(df_all) <- df_all$cell_barcode
df_all <- df_all[colnames(seurat_ATAC), ]

seurat_ATAC$annotation_level_3 <- df_all$annotation_level_3
seurat_ATAC@meta.data$annotation_prob  <- 1
melt(table(seurat_ATAC$annotation_level_3))
##                          Var1 value
## 1             activated CD4 T    33
## 2                  CD200+TOX+   238
## 3         CD4 Non-Tfh KLRB1+    467
## 4           Central Mem PASK-   276
## 5           Central Mem PASK+  1233
## 6  Follicular Th CXCL13+CBLB+   720
## 7        Follicular Th CXCR5+  1176
## 8         Follicular Th TOX2+   911
## 9           IL2RA+FOXP3+ Treg   477
## 10             Memory T cells   233
## 11     Mitochondrial+ T cells  1425
## 12                      Naive  2551
## 13          naive Treg IKZF2+   240
## 14                       Th17   340
## 15           Treg IKZF2+HPGD+   155
## 16                unannotated  8846
table(is.na(seurat_ATAC$annotation_level_3))
## 
## FALSE 
## 19321

2.5 General low-dimensionality representation of the assays

DimPlot(seurat_ATAC,
  group.by = "annotation_level_3",
  split.by = "assay",
  cols = color_palette,
  pt.size = 0.5)

3 KNN Algorithm

3.1 Data Splicing

Data splicing basically involves splitting the data set into training and testing data set.

reference_cells <- colnames(seurat_ATAC)[seurat_ATAC$assay == "multiome"]
query_cells <- colnames(seurat_ATAC)[seurat_ATAC$assay == "scATAC"]

reduction_ref <- seurat_ATAC@reductions[[reduction]]@cell.embeddings[reference_cells, dims]
reduction_query <- seurat_ATAC@reductions[[reduction]]@cell.embeddings[query_cells, dims]

3.2 Cross-validation of the K parameter.

We’re going to calculate the number of observations in the training dataset that correspond to the Multiome data. The reason we’re doing this is that we want to initialize the value of ‘K’ in the KNN model. To do that, we split our training data in two part: a train.loan, that correspond to the random selection of the 70% of the training data and the test.loan, that is the remaining 30% of the data set. The first one is used to traint the system while the second is uses to evaluate the learned system.

dat.d <- sample(1:nrow(reduction_ref),
               size=nrow(reduction_ref)*0.7,replace = FALSE) 

train.loan  <- reduction_ref[dat.d,] # 70% training data
test.loan <- reduction_ref[-dat.d,] # remaining 30% test data

train.loan_labels <- seurat_ATAC@meta.data[row.names(train.loan),]$annotation_level_3
test.loan_labels <- seurat_ATAC@meta.data[row.names(test.loan),]$annotation_level_3

k.optm <- c()
k.values <- c()

for (i in 2^(0:8)){
 print(i)
 knn.mod <- knn(train=train.loan, test=test.loan, cl=train.loan_labels, k=i)
 k.optm <- c(k.optm, 100 * sum(test.loan_labels == knn.mod)/NROW(test.loan_labels))
 k.values <- c(k.values,i)
}
## [1] 1
## [1] 2
## [1] 4
## [1] 8
## [1] 16
## [1] 32
## [1] 64
## [1] 128
## [1] 256

Now we can plot the accuracy of the model taking in account a range of different K and selec the best one.

k.optim = data.frame(k.values,k.optm)

p3 <- ggplot(data=k.optim, aes(x=k.values, y=k.optm, group=1)) +
 geom_line() +
 geom_point() + 
 geom_vline(xintercept=16, linetype="dashed", color = "red")

p3

3.3 Building a Machine Learning model with the optimal k value.

train.loan  <- reduction_ref
test.loan <- reduction_query

train.loan_labels <- seurat_ATAC@meta.data[row.names(train.loan),]$annotation_level_3
test.loan_labels <- seurat_ATAC@meta.data[row.names(test.loan),]$annotation_level_3

knn.mod <- knn(train=train.loan, test=test.loan, cl=train.loan_labels, k=16, prob=T)

annotation_data <- data.frame(query_cells, knn.mod, attr(knn.mod,"prob"))
colnames(annotation_data) <- c("query_cells",
                               "annotation_level_3",
                               "annotation_prob")

annotation_data$annotation_level_3 <- as.character(annotation_data$annotation_level_3)
seurat_ATAC@meta.data[annotation_data$query_cells,]$annotation_level_3 <- annotation_data$annotation_level_3
seurat_ATAC@meta.data[annotation_data$query_cells,]$annotation_prob <- annotation_data$annotation_prob
seurat_ATAC$annotation_level_3 <- factor(seurat_ATAC$annotation_level_3)

3.4 Low-dimensionality representation of the assays

DimPlot(
  seurat_ATAC,
  cols = color_palette,
  group.by = "annotation_level_3",
  pt.size = 0.1)

DimPlot(
  cols = color_palette,
  seurat_ATAC, reduction = "umap",
  group.by = "annotation_level_3",
  pt.size = 0.1,  split.by = "assay")

melt(table(seurat_ATAC$annotation_level_3))
##                          Var1 value
## 1             activated CD4 T    33
## 2                  CD200+TOX+   440
## 3         CD4 Non-Tfh KLRB1+    896
## 4           Central Mem PASK-   288
## 5           Central Mem PASK+  1989
## 6  Follicular Th CXCL13+CBLB+  1175
## 7        Follicular Th CXCR5+  2520
## 8         Follicular Th TOX2+  2055
## 9           IL2RA+FOXP3+ Treg   826
## 10             Memory T cells   322
## 11     Mitochondrial+ T cells  2905
## 12                      Naive  4573
## 13          naive Treg IKZF2+   415
## 14                       Th17   650
## 15           Treg IKZF2+HPGD+   234
saveRDS(seurat_ATAC, path_to_save)

3.5 Low-dimensionality representation of the prediction probability

Note that the probability of the prediction was lower in the transitioning cells and in not-defined clusters.

seurat_ATAC_scATAC = subset(seurat_ATAC, assay == "scATAC")

FeaturePlot(
  seurat_ATAC_scATAC, reduction = "umap",
  features = "annotation_prob",
  pt.size = 0.1)

4 Session Information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Tonsil_atlas/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] reshape2_1.4.4    class_7.3-18      harmony_1.0       Rcpp_1.0.5        plyr_1.8.6        forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4       readr_1.4.0       tidyr_1.1.2       tibble_3.0.4      ggplot2_3.3.2     tidyverse_1.3.0   flexclust_1.4-0   modeltools_0.2-23 lattice_0.20-41   Signac_1.1.0.9000 Seurat_3.9.9.9010 BiocStyle_2.16.1 
## 
## loaded via a namespace (and not attached):
##   [1] reticulate_1.18             tidyselect_1.1.0            RSQLite_2.2.1               AnnotationDbi_1.50.3        htmlwidgets_1.5.2           BiocParallel_1.22.0         Rtsne_0.15                  munsell_0.5.0               codetools_0.2-17            ica_1.0-2                   future_1.20.1               miniUI_0.1.1.1              withr_2.3.0                 colorspace_2.0-0            Biobase_2.48.0              OrganismDbi_1.30.0          knitr_1.30                  rstudioapi_0.12             ROCR_1.0-11                 tensor_1.5                  listenv_0.8.0               labeling_0.4.2              GenomeInfoDbData_1.2.3      polyclip_1.10-0             farver_2.0.3                bit64_4.0.5                 rprojroot_2.0.2             parallelly_1.21.0           vctrs_0.3.4                 generics_0.1.0              xfun_0.18                   biovizBase_1.36.0           BiocFileCache_1.12.1        lsa_0.73.2                  ggseqlogo_0.1               R6_2.5.0                    GenomeInfoDb_1.24.0         rsvd_1.0.3                  AnnotationFilter_1.12.0     bitops_1.0-6                spatstat.utils_2.1-0        reshape_0.8.8              
##  [43] DelayedArray_0.14.0         assertthat_0.2.1            promises_1.1.1              scales_1.1.1                nnet_7.3-14                 gtable_0.3.0                globals_0.13.1              goftest_1.2-2               ggbio_1.36.0                ensembldb_2.12.1            rlang_0.4.11                RcppRoll_0.3.0              splines_4.0.3               rtracklayer_1.48.0          lazyeval_0.2.2              dichromat_2.0-0             broom_0.7.2                 checkmate_2.0.0             modelr_0.1.8                BiocManager_1.30.10         yaml_2.2.1                  abind_1.4-5                 GenomicFeatures_1.40.1      backports_1.2.0             httpuv_1.5.4                Hmisc_4.4-1                 RBGL_1.64.0                 tools_4.0.3                 bookdown_0.21               ellipsis_0.3.1              RColorBrewer_1.1-2          BiocGenerics_0.34.0         ggridges_0.5.2              base64enc_0.1-3             progress_1.2.2              zlibbioc_1.34.0             RCurl_1.98-1.2              prettyunits_1.1.1           rpart_4.1-15                openssl_1.4.3               deldir_0.2-3                pbapply_1.4-3              
##  [85] cowplot_1.1.0               S4Vectors_0.26.0            zoo_1.8-8                   haven_2.3.1                 SummarizedExperiment_1.18.1 ggrepel_0.8.2               cluster_2.1.0               here_1.0.1                  fs_1.5.0                    magrittr_1.5                data.table_1.13.2           reprex_0.3.0                lmtest_0.9-38               RANN_2.6.1                  SnowballC_0.7.0             ProtGenerics_1.20.0         fitdistrplus_1.1-1          matrixStats_0.57.0          hms_0.5.3                   patchwork_1.1.0             mime_0.9                    evaluate_0.14               xtable_1.8-4                XML_3.99-0.3                jpeg_0.1-8.1                readxl_1.3.1                IRanges_2.22.1              gridExtra_2.3               compiler_4.0.3              biomaRt_2.44.4              KernSmooth_2.23-17          crayon_1.3.4                htmltools_0.5.1.1           mgcv_1.8-33                 later_1.1.0.1               Formula_1.2-4               lubridate_1.7.9             DBI_1.1.0                   tweenr_1.0.1                dbplyr_1.4.4                MASS_7.3-53                 rappdirs_0.3.1             
## [127] Matrix_1.3-2                cli_2.1.0                   parallel_4.0.3              igraph_1.2.6                GenomicRanges_1.40.0        pkgconfig_2.0.3             GenomicAlignments_1.24.0    foreign_0.8-80              plotly_4.9.2.1              xml2_1.3.2                  XVector_0.28.0              rvest_0.3.6                 VariantAnnotation_1.34.0    digest_0.6.27               sctransform_0.3.1           RcppAnnoy_0.0.16            graph_1.66.0                spatstat.data_2.1-0         Biostrings_2.56.0           cellranger_1.1.0            rmarkdown_2.5               leiden_0.3.5                fastmatch_1.1-0             htmlTable_2.1.0             uwot_0.1.8.9001             curl_4.3                    shiny_1.5.0                 Rsamtools_2.4.0             lifecycle_0.2.0             nlme_3.1-150                jsonlite_1.7.1              fansi_0.4.1                 viridisLite_0.3.0           askpass_1.1                 BSgenome_1.56.0             pillar_1.4.6                GGally_2.0.0                fastmap_1.0.1               httr_1.4.2                  survival_3.2-7              glue_1.4.2                  spatstat_1.64-1            
## [169] png_0.1-7                   bit_4.0.4                   ggforce_0.3.2               stringi_1.5.3               blob_1.2.1                  latticeExtra_0.6-29         memoise_1.1.0               irlba_2.3.3                 future.apply_1.6.0